#%%
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
这个module是对神经网络的定义，并将定义的输入tensor（即 placeholder）和输出tensor返回给主调用程序，
在主调用程序里面通过 feed_dict 字典对输入向量（placeholder）进行赋值，便可以对输出tensor进行sess.run求解
"""

import tensorflow as tf

# Hyperparameters
# The learning rate can be tuned freely to trade off training speed vs. stability.
learning_rate = 1e-1 # feel free to play with this to train faster or more stably.
D = 4   # input dimensionality (size of the observation vector)
H = 50  # number of hidden layer neurons

# Reset TensorFlow's default graph so repeated runs start from a clean slate.
tf.reset_default_graph()

def get_action_p( observations ):
    """Policy network: map observations to the probability of picking action 0.

    The network goes from an observation of the environment to a probability
    of choosing one of the two actions; the probability of action input_y=0
    is the returned value, and action input_y=1 is chosen with 1 - probability.

    Args:
        observations: [None, D] float32 tensor/placeholder of observed states.

    Returns:
        [None, 1] tensor of sigmoid probabilities.
    """
    # Two fully connected layers: D -> H with ReLU, then H -> 1 with sigmoid.
    hidden_weights = tf.get_variable("W1", shape=[D, H],
                                     initializer=tf.contrib.layers.xavier_initializer())
    hidden = tf.nn.relu(tf.matmul(observations, hidden_weights))
    output_weights = tf.get_variable("W2", shape=[H, 1],
                                     initializer=tf.contrib.layers.xavier_initializer())
    logit = tf.matmul(hidden, output_weights)
    return tf.nn.sigmoid(logit)


def get_loss(advantages, input_y, probability):
    """Policy-gradient loss.

    Pushes the weights toward making actions that produced good advantage
    (reward over time) more likely, and actions that did not less likely.

    Args:
        advantages: [None, 1] tensor of discounted cumulative rewards obtained
            after taking each action.
        input_y: [None, 1] tensor of taken actions encoded as 0.0 or 1.0.
        probability: [None, 1] tensor, P(action 0) from the policy network
            (action 1 has probability 1 - probability).

    Returns:
        Scalar tensor: the mean negative advantage-weighted log-likelihood
        over the episode.
    """
    # The blended expression selects the probability of the action actually
    # taken: for input_y == 1 it reduces to (1 - probability), and for
    # input_y == 0 it reduces to probability.
    picked_prob = input_y * (input_y - probability) + (1 - input_y) * (input_y + probability)
    log_likelihood = tf.log(picked_prob)
    # Weight each step's log-likelihood by its advantage and average
    # (reduce_mean) across all (observation, input_y, advantage) samples.
    return -tf.reduce_mean(log_likelihood * advantages)


def get_grad(loss):
    """Return the gradients of `loss` w.r.t. every trainable variable in the graph."""
    trainable = tf.trainable_variables()  # all variables the graph marks as trainable
    return tf.gradients(loss, trainable)


def opteration_updata_para(batchGrad):
    """Build the graph op that applies a batch of accumulated gradients.

    NOTE(review): the name is misspelled ("operation_update_para" was likely
    intended) but is kept unchanged because callers reference it.

    Args:
        batchGrad: list of gradient tensors/placeholders, one per trainable
            variable, in the same order as tf.trainable_variables().

    Returns:
        The Adam gradient-application op.
    """
    trainable = tf.trainable_variables()
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)  # our optimizer
    # Pair each incoming gradient with its variable and apply the update.
    return optimizer.apply_gradients(zip(batchGrad, trainable))


#############################################################


# From here we define the parts of the network needed for learning a good policy.
observations = tf.placeholder(tf.float32, [None, D], name="input_x")
input_y = tf.placeholder(tf.float32,[None,1], name="input_y")
advantages = tf.placeholder(tf.float32, [None, 1], name="reward_signal")

# Probability of taking action input_y=0 for each observation; action
# input_y=1 is taken with probability 1 - probability (see get_loss).
probability = get_action_p( observations )
# Loss given cumulative returns `advantages`, taken actions `input_y`, and the
# network's per-observation probability output.
loss = get_loss(advantages, input_y, probability)
# Gradients of the loss w.r.t. every trainable variable in the graph.
newGrads = get_grad(loss)


# Once we have collected a series of gradients from multiple episodes, we apply them.
# We don't just apply gradients after every episode, in order to account for noise in the reward signal.
W1Grad = tf.placeholder(tf.float32,name="batch_grad1") # Placeholders to send the final gradients through when we update.
W2Grad = tf.placeholder(tf.float32,name="batch_grad2")
batchGrad = [W1Grad, W2Grad]
# opt_updateGrads is a graph op: feeding it one batch of accumulated gradients
# updates the network parameters.
opt_updateGrads = opteration_updata_para(batchGrad)

